// Copyright 2018 ETH Zurich and University of Bologna.
// Copyright and related rights are licensed under the Solderpad Hardware
// License, Version 0.51 (the "License"); you may not use this file except in
// compliance with the License.  You may obtain a copy of the License at
// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
// or agreed to in writing, software, hardware and materials distributed under
// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
//
// Author: Michael Schaffner <schaffner@iis.ee.ethz.ch>, ETH Zurich
// Date: 15.08.2018
// Description: Instruction cache that is compatible with openpiton.
//
// Some notes:
//
// 1) refills always have the size of one cache line, except for accesses to the I/O region, which is mapped
//    to the top half of the physical address space (bit 39 = 1). the data width of the interface has the width
//    of one cache line, and hence the ifills can be transferred in a single cycle. note that the ifills must be
//    consumed unconditionally.
//
// 2) instruction fetches are always assumed to be aligned to 32bit (lower 2 bits are ignored)
//
// 3) NC accesses to I/O space are expected to return 32bit from memory.
//

import ariane_pkg::*;
import wt_cache_pkg::*;

module wt_icache  #(
  parameter logic [CACHE_ID_WIDTH-1:0]  RdTxId             = 0,                                  // ID to be used for read transactions
  parameter ariane_pkg::ariane_cfg_t    ArianeCfg          = ariane_pkg::ArianeDefaultConfig     // contains cacheable regions
) (
  input  logic                      clk_i,
  input  logic                      rst_ni,

  input  logic                      flush_i,              // flush the icache, flush and kill have to be asserted together
  input  logic                      en_i,                 // enable icache
  output logic                      miss_o,               // to performance counter
  // address translation requests
  input  icache_areq_i_t            areq_i,
  output icache_areq_o_t            areq_o,
  // data requests
  input  icache_dreq_i_t            dreq_i,
  output icache_dreq_o_t            dreq_o,
  // refill port
  input  logic                      mem_rtrn_vld_i,
  input  icache_rtrn_t              mem_rtrn_i,
  output logic                      mem_data_req_o,
  input  logic                      mem_data_ack_i,
  output icache_req_t               mem_data_o
);

  // signals
  logic                                 cache_en_d, cache_en_q;       // cache is enabled
  logic [riscv::VLEN-1:0]               vaddr_d, vaddr_q;
  logic                                 paddr_is_nc;                  // asserted if physical address is non-cacheable
  logic [ICACHE_SET_ASSOC-1:0]          cl_hit;                       // hit from tag compare
  logic                                 cache_rden;                   // triggers cache lookup
  logic                                 cache_wren;                   // triggers write to cacheline
  logic                                 cmp_en_d, cmp_en_q;           // enable tag comparison in next cycle. used to cut long path due to NC signal.
  logic                                 flush_d, flush_q;             // used to register and signal pending flushes

  // replacement strategy
  logic                                 update_lfsr;                  // shift the LFSR
  logic [$clog2(ICACHE_SET_ASSOC)-1:0]  inv_way;                      // first non-valid encountered
  logic [$clog2(ICACHE_SET_ASSOC)-1:0]  rnd_way;                      // random index for replacement
  logic [$clog2(ICACHE_SET_ASSOC)-1:0]  repl_way;                     // way to replace
  logic [ICACHE_SET_ASSOC-1:0]          repl_way_oh_d, repl_way_oh_q; // way to replace (onehot)
  logic                                 all_ways_valid;               // we need to switch repl strategy since all are valid

  // invalidations / flushing
  logic                                 inv_en;                       // incoming invalidations
  logic                                 flush_en, flush_done;         // used to flush cache entries
  logic [ICACHE_CL_IDX_WIDTH-1:0]       flush_cnt_d, flush_cnt_q;     // used to flush cache entries

  // mem arrays
  logic                                 cl_we;                        // write enable to memory array
  logic [ICACHE_SET_ASSOC-1:0]          cl_req;                       // request to memory array
  logic [ICACHE_CL_IDX_WIDTH-1:0]       cl_index;                     // this is a cache-line index, to memory array
  logic [ICACHE_OFFSET_WIDTH-1:0]       cl_offset_d, cl_offset_q;     // offset in cache line
  logic [ICACHE_TAG_WIDTH-1:0]          cl_tag_d, cl_tag_q;           // this is the cache tag
  logic [ICACHE_TAG_WIDTH-1:0]          cl_tag_rdata [ICACHE_SET_ASSOC-1:0]; // these are the tags coming from the tagmem
  logic [ICACHE_LINE_WIDTH-1:0]         cl_rdata     [ICACHE_SET_ASSOC-1:0]; // these are the cachelines coming from the cache
  logic [ICACHE_SET_ASSOC-1:0][FETCH_WIDTH-1:0]cl_sel;                // selected word from each cacheline
  logic [ICACHE_SET_ASSOC-1:0]          vld_req;                      // bit enable for valid regs
  logic                                 vld_we;                       // valid bits write enable
  logic [ICACHE_SET_ASSOC-1:0]          vld_wdata;                    // valid bits to write
  logic [ICACHE_SET_ASSOC-1:0]          vld_rdata;                    // valid bits coming from valid regs
  logic [ICACHE_CL_IDX_WIDTH-1:0]       vld_addr;                     // valid bit

  // cpmtroller FSM
  typedef enum logic[2:0] {FLUSH, IDLE, READ, MISS, TLB_MISS, KILL_ATRANS, KILL_MISS} state_e;
  state_e state_d, state_q;

///////////////////////////////////////////////////////
// address -> cl_index mapping, interface plumbing
///////////////////////////////////////////////////////

  // extract tag from physical address, check if NC
  assign cl_tag_d  = (areq_i.fetch_valid) ? areq_i.fetch_paddr[ICACHE_TAG_WIDTH+ICACHE_INDEX_WIDTH-1:ICACHE_INDEX_WIDTH] : cl_tag_q;

  // noncacheable if request goes to I/O space, or if cache is disabled
  assign paddr_is_nc = (~cache_en_q) | (~ariane_pkg::is_inside_cacheable_regions(ArianeCfg, {{64-ICACHE_TAG_WIDTH{1'b0}}, cl_tag_d, {ICACHE_INDEX_WIDTH{1'b0}}}));

  // pass exception through
  assign dreq_o.ex = areq_i.fetch_exception;

  // latch this in case we have to stall later on
  // make sure this is 32bit aligned
  assign vaddr_d = (dreq_o.ready & dreq_i.req) ? dreq_i.vaddr : vaddr_q;
  assign areq_o.fetch_vaddr = {vaddr_q>>2, 2'b0};

  // split virtual address into index and offset to address cache arrays
  assign cl_index    = vaddr_d[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH];


  if (ArianeCfg.Axi64BitCompliant) begin : gen_axi_offset
    // if we generate a noncacheable access, the word will be at offset 0 or 4 in the cl coming from memory
    assign cl_offset_d = ( dreq_o.ready & dreq_i.req)      ? {dreq_i.vaddr>>2, 2'b0} :
                         ( paddr_is_nc  & mem_data_req_o ) ? cl_offset_q[2]<<2 : // needed since we transfer 32bit over a 64bit AXI bus in this case
                                                             cl_offset_q;
    // request word address instead of cl address in case of NC access
    assign mem_data_o.paddr = (paddr_is_nc) ? {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:3], 3'b0} :                                         // align to 32bit
                                              {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH], {ICACHE_OFFSET_WIDTH{1'b0}}}; // align to cl
end else begin : gen_piton_offset
    // icache fills are either cachelines or 4byte fills, depending on whether they go to the Piton I/O space or not.
    // since the piton cache system replicates the data, we can always index the full CL
    assign cl_offset_d = ( dreq_o.ready & dreq_i.req)      ? {dreq_i.vaddr>>2, 2'b0} :
                                                             cl_offset_q;

    // request word address instead of cl address in case of NC access
    assign mem_data_o.paddr = (paddr_is_nc) ? {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:2], 2'b0} :                                         // align to 32bit
                                              {cl_tag_d, vaddr_q[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH], {ICACHE_OFFSET_WIDTH{1'b0}}}; // align to cl
  end


  assign mem_data_o.tid   = RdTxId;

  assign mem_data_o.nc    = paddr_is_nc;
  // way that is being replaced
  assign mem_data_o.way   = repl_way;
  assign dreq_o.vaddr     = vaddr_q;

///////////////////////////////////////////////////////
// main control logic
///////////////////////////////////////////////////////

  always_comb begin : p_fsm
    // default assignment
    state_d      = state_q;
    cache_en_d   = cache_en_q & en_i;// disabling the cache is always possible, enable needs to go via flush
    flush_en     = 1'b0;
    cmp_en_d     = 1'b0;
    cache_rden   = 1'b0;
    cache_wren   = 1'b0;
    inv_en       = 1'b0;
    flush_d      = flush_q | flush_i; // register incoming flush

    // interfaces
    dreq_o.ready     = 1'b0;
    areq_o.fetch_req = 1'b0;
    dreq_o.valid     = 1'b0;
    mem_data_req_o   = 1'b0;
    // performance counter
    miss_o           = 1'b0;

    // handle invalidations unconditionally
    // note: invald are mutually exclusive with
    // ifills, since both arrive over the same IF
    // however, we need to make sure below that we
    // do not trigger a cache readout at the same time...
    if (mem_rtrn_vld_i && mem_rtrn_i.rtype == ICACHE_INV_REQ) begin
      inv_en = 1'b1;
    end

    unique case (state_q)
      //////////////////////////////////
      // this clears all valid bits
      FLUSH: begin
          flush_en = 1'b1;
        if (flush_done) begin
          state_d = IDLE;
          flush_d = 1'b0;
          // if the cache was not enabled set this
          cache_en_d = en_i;
        end
      end
      //////////////////////////////////
      // wait for an incoming request
      IDLE: begin
          // only enable tag comparison if cache is enabled
          cmp_en_d = cache_en_q;

          // handle pending flushes, or perform cache clear upon enable
          if (flush_d || (en_i && !cache_en_q)) begin
            state_d    = FLUSH;
          // wait for incoming requests
          end else begin
            // mem requests are for sure invals here
            if (!mem_rtrn_vld_i) begin
              dreq_o.ready = 1'b1;
              // we have a new request
              if (dreq_i.req) begin
                cache_rden       = 1'b1;
                state_d          = READ;
              end
            end
            if (dreq_i.kill_s1) begin
              state_d = IDLE;
            end
          end
      end
      //////////////////////////////////
      // check whether we have a hit
      // in case the cache is disabled,
      // or in case the address is NC, we
      // reuse the miss mechanism to handle
      // the request
      READ: begin
          state_d          = TLB_MISS;
          areq_o.fetch_req = '1;
          // only enable tag comparison if cache is enabled
          cmp_en_d    = cache_en_q;
          // readout speculatively
          cache_rden  = cache_en_q;

          if (areq_i.fetch_valid) begin
            // check if we have to flush
            if (flush_d) begin
              state_d  = IDLE;
            // we have a hit or an exception output valid result
            end else if ((|cl_hit && cache_en_q) || areq_i.fetch_exception.valid) begin
              dreq_o.valid     = ~dreq_i.kill_s2;// just don't output in this case
              state_d          = IDLE;

              // we can accept another request
              // and stay here, but only if no inval is coming in
              // note: we are not expecting ifill return packets here...
              if (!mem_rtrn_vld_i) begin
                dreq_o.ready     = 1'b1;
                if (dreq_i.req) begin
                  state_d          = READ;
                end
              end
              // if a request is being killed at this stage,
              // we have to bail out and wait for the address translation to complete
              if (dreq_i.kill_s1) begin
                state_d = IDLE;
              end
            // we have a miss / NC transaction
            end else if (dreq_i.kill_s2) begin
              state_d = IDLE;
            end else begin
              cmp_en_d = 1'b0;
              // only count this as a miss if the cache is enabled, and
              // the address is cacheable
              // send out ifill request
              mem_data_req_o = 1'b1;
              if (mem_data_ack_i) begin
                miss_o         = ~paddr_is_nc;
                state_d        = MISS;
              end
            end
          // bail out if this request is being killed (and we missed on the TLB)
          end else if (dreq_i.kill_s2 || flush_d) begin
            state_d  = KILL_ATRANS;
          end
      end
      //////////////////////////////////
      // wait until the memory transaction
      // returns. do not write to memory
      // if the nc bit is set.
      TLB_MISS: begin
        areq_o.fetch_req = '1;
        // only enable tag comparison if cache is enabled
        cmp_en_d = cache_en_q;
        // readout speculatively
        cache_rden = cache_en_q;

        if (areq_i.fetch_valid) begin
          // check if we have to kill this request
          if (dreq_i.kill_s2 | flush_d) begin
            state_d  = IDLE;
          // check whether we got an exception
          end else if (areq_i.fetch_exception.valid) begin
            dreq_o.valid = 1'b1;
            state_d      = IDLE;
          // re-trigger cache readout for tag comparison and cache line selection
          // but if we got an invalidation, we have to wait another cycle
          end else if (!mem_rtrn_vld_i) begin
            state_d          = READ;
          end
        // bail out if this request is being killed
        end else if (dreq_i.kill_s2 || flush_d) begin
          state_d  = KILL_ATRANS;
        end
      end
      //////////////////////////////////
      // wait until the memory transaction
      // returns. do not write to memory
      // if the nc bit is set.
      MISS: begin
        // note: this is mutually exclusive with ICACHE_INV_REQ,
        // so we do not have to check for invals here
        if (mem_rtrn_vld_i && mem_rtrn_i.rtype == ICACHE_IFILL_ACK) begin
          state_d      = IDLE;
          // only return data if request is not being killed
          if (!(dreq_i.kill_s2 || flush_d)) begin
            dreq_o.valid = 1'b1;
            // only write to cache if this address is cacheable
            cache_wren   = ~paddr_is_nc;
          end
        // bail out if this request is being killed
        end else if (dreq_i.kill_s2 || flush_d) begin
          state_d  = KILL_MISS;
        end
      end
      //////////////////////////////////
      // killed address translation,
      // wait until paddr is valid, and go
      // back to idle
      KILL_ATRANS: begin
        areq_o.fetch_req = '1;
        if (areq_i.fetch_valid) begin
          state_d      = IDLE;
        end
      end
      //////////////////////////////////
      // killed miss,
      // wait until memory responds and
      // go back to idle
      KILL_MISS: begin
        if (mem_rtrn_vld_i && mem_rtrn_i.rtype == ICACHE_IFILL_ACK) begin
          state_d      = IDLE;
        end
      end
      default: begin
        // we should never get here
        state_d = FLUSH;
      end
    endcase // state_q
  end

///////////////////////////////////////////////////////
// valid bit invalidation and replacement strategy
///////////////////////////////////////////////////////

  // note: it cannot happen that we get an invalidation + a cl replacement
  // in the same cycle as these requests arrive via the same interface
  // flushes take precedence over invalidations (it is ok if we ignore
  // the inval since the cache is cleared anyway)

  assign flush_cnt_d = (flush_done) ? '0               :
                       (flush_en)   ?  flush_cnt_q + 1 :
                                       flush_cnt_q;

  assign flush_done  = (flush_cnt_q==(ICACHE_NUM_WORDS-1));

  // invalidation/clearing address
  // flushing takes precedence over invals
  assign vld_addr = (flush_en)       ? flush_cnt_q        :
                    (inv_en)         ? mem_rtrn_i.inv.idx[ICACHE_INDEX_WIDTH-1:ICACHE_OFFSET_WIDTH] :
                                       cl_index;

  assign vld_req  = (flush_en || cache_rden)        ? '1                                    :
                    (mem_rtrn_i.inv.all && inv_en)  ? '1                                    :
                    (mem_rtrn_i.inv.vld && inv_en)  ? icache_way_bin2oh(mem_rtrn_i.inv.way) :
                                                      repl_way_oh_q;

  assign vld_wdata = (cache_wren) ? '1 : '0;

  assign vld_we    = (cache_wren | inv_en | flush_en);
  // assign vld_req   = (vld_we | cache_rden);


  // chose random replacement if all are valid
  assign update_lfsr   = cache_wren & all_ways_valid;
  assign repl_way      = (all_ways_valid) ? rnd_way : inv_way;
  assign repl_way_oh_d = (cmp_en_q) ? icache_way_bin2oh(repl_way) : repl_way_oh_q;

  // enable signals for memory arrays
  assign cl_req   = (cache_rden) ? '1            :
                    (cache_wren) ? repl_way_oh_q :
                                   '0;
  assign cl_we    = cache_wren;


  // find invalid cache line
  lzc #(
    .WIDTH ( ICACHE_SET_ASSOC )
  ) i_lzc (
    .in_i    ( ~vld_rdata     ),
    .cnt_o   ( inv_way        ),
    .empty_o ( all_ways_valid )
  );

  // generate random cacheline index
  lfsr_8bit #(
    .WIDTH (ICACHE_SET_ASSOC)
  ) i_lfsr (
    .clk_i          ( clk_i       ),
    .rst_ni         ( rst_ni      ),
    .en_i           ( update_lfsr ),
    .refill_way_oh  (             ),
    .refill_way_bin ( rnd_way     )
  );


///////////////////////////////////////////////////////
// tag comparison, hit generation
///////////////////////////////////////////////////////

  logic [$clog2(ICACHE_SET_ASSOC)-1:0] hit_idx;

  for (genvar i=0;i<ICACHE_SET_ASSOC;i++) begin : gen_tag_cmpsel
    assign cl_hit[i] = (cl_tag_rdata[i] == cl_tag_d) & vld_rdata[i];
    assign cl_sel[i] = cl_rdata[i][{cl_offset_q,3'b0} +: FETCH_WIDTH];
  end


  lzc #(
    .WIDTH ( ICACHE_SET_ASSOC )
  ) i_lzc_hit (
    .in_i    ( cl_hit  ),
    .cnt_o   ( hit_idx ),
    .empty_o (         )
  );

  assign dreq_o.data = (cmp_en_q) ? cl_sel[hit_idx] :
                                    mem_rtrn_i.data[{cl_offset_q,3'b0} +: FETCH_WIDTH];

///////////////////////////////////////////////////////
// memory arrays and regs
///////////////////////////////////////////////////////


  logic [ICACHE_TAG_WIDTH:0] cl_tag_valid_rdata [ICACHE_SET_ASSOC-1:0];

  for (genvar i = 0; i < ICACHE_SET_ASSOC; i++) begin : gen_sram
    // Tag RAM
    sram #(
      // tag + valid bit
      .DATA_WIDTH ( ICACHE_TAG_WIDTH+1 ),
      .NUM_WORDS  ( ICACHE_NUM_WORDS   )
    ) tag_sram (
      .clk_i     ( clk_i                    ),
      .rst_ni    ( rst_ni                   ),
      .req_i     ( vld_req[i]               ),
      .we_i      ( vld_we                   ),
      .addr_i    ( vld_addr                 ),
      // we can always use the saved tag here since it takes a
      // couple of cycle until we write to the cache upon a miss
      .wdata_i   ( {vld_wdata[i], cl_tag_q} ),
      .be_i      ( '1                       ),
      .rdata_o   ( cl_tag_valid_rdata[i]    )
    );

    assign cl_tag_rdata[i] = cl_tag_valid_rdata[i][ICACHE_TAG_WIDTH-1:0];
    assign vld_rdata[i]    = cl_tag_valid_rdata[i][ICACHE_TAG_WIDTH];

    // Data RAM
    sram #(
      .DATA_WIDTH ( ICACHE_LINE_WIDTH ),
      .NUM_WORDS  ( ICACHE_NUM_WORDS  )
    ) data_sram (
      .clk_i     ( clk_i               ),
      .rst_ni    ( rst_ni              ),
      .req_i     ( cl_req[i]           ),
      .we_i      ( cl_we               ),
      .addr_i    ( cl_index            ),
      .wdata_i   ( mem_rtrn_i.data     ),
      .be_i      ( '1                  ),
      .rdata_o   ( cl_rdata[i]         )
    );
  end


  always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
    if(!rst_ni) begin
      cl_tag_q      <= '0;
      flush_cnt_q   <= '0;
      vaddr_q       <= '0;
      cmp_en_q      <= '0;
      cache_en_q    <= '0;
      flush_q       <= '0;
      state_q       <= IDLE;
      cl_offset_q   <= '0;
      repl_way_oh_q <= '0;
    end else begin
      cl_tag_q      <= cl_tag_d;
      flush_cnt_q   <= flush_cnt_d;
      vaddr_q       <= vaddr_d;
      cmp_en_q      <= cmp_en_d;
      cache_en_q    <= cache_en_d;
      flush_q       <= flush_d;
      state_q       <= state_d;
      cl_offset_q   <= cl_offset_d;
      repl_way_oh_q <= repl_way_oh_d;
    end
  end

///////////////////////////////////////////////////////
// assertions
///////////////////////////////////////////////////////

//pragma translate_off
`ifndef VERILATOR
  repl_inval0: assert property (
    @(posedge clk_i) disable iff (!rst_ni) cache_wren |-> !(mem_rtrn_i.inv.all | mem_rtrn_i.inv.vld))
      else $fatal(1,"[l1 icache] cannot replace cacheline and invalidate cacheline simultaneously");

  repl_inval1: assert property (
    @(posedge clk_i) disable iff (!rst_ni) (mem_rtrn_i.inv.all | mem_rtrn_i.inv.vld) |-> !cache_wren)
      else $fatal(1,"[l1 icache] cannot replace cacheline and invalidate cacheline simultaneously");

  invalid_state: assert property (
    @(posedge clk_i) disable iff (!rst_ni) (state_q inside {FLUSH, IDLE, READ, MISS, TLB_MISS, KILL_ATRANS, KILL_MISS}))
      else $fatal(1,"[l1 icache] fsm reached an invalid state");

  hot1: assert property (
    @(posedge clk_i) disable iff (!rst_ni) (!inv_en) |-> cache_rden |=> cmp_en_q |-> $onehot0(cl_hit))
      else $fatal(1,"[l1 icache] cl_hit signal must be hot1");

  // this is only used for verification!
  logic                                    vld_mirror[wt_cache_pkg::ICACHE_NUM_WORDS-1:0][ariane_pkg::ICACHE_SET_ASSOC-1:0];
  logic [ariane_pkg::ICACHE_TAG_WIDTH-1:0] tag_mirror[wt_cache_pkg::ICACHE_NUM_WORDS-1:0][ariane_pkg::ICACHE_SET_ASSOC-1:0];
  logic [ariane_pkg::ICACHE_SET_ASSOC-1:0] tag_write_duplicate_test;

  always_ff @(posedge clk_i or negedge rst_ni) begin : p_mirror
    if(!rst_ni) begin
      vld_mirror <= '{default:'0};
      tag_mirror <= '{default:'0};
    end else begin
      for (int i = 0; i < ICACHE_SET_ASSOC; i++) begin
        if(vld_req[i] & vld_we) begin
          vld_mirror[vld_addr][i] <= vld_wdata[i];
          tag_mirror[vld_addr][i] <= cl_tag_q;
        end
      end
    end
  end

  for (genvar i = 0; i < ICACHE_SET_ASSOC; i++) begin : gen_tag_dupl
    assign tag_write_duplicate_test[i] = (tag_mirror[vld_addr][i] == cl_tag_q) & vld_mirror[vld_addr][i] & (|vld_wdata);
  end

  tag_write_duplicate: assert property (
    @(posedge clk_i) disable iff (!rst_ni) |vld_req |-> vld_we |-> !(|tag_write_duplicate_test))
      else $fatal(1,"[l1 icache] cannot allocate a CL that is already present in the cache");


 initial begin
  // assert wrong parameterizations
  assert (ICACHE_INDEX_WIDTH<=12)
    else $fatal(1,"[l1 icache] cache index width can be maximum 12bit since VM uses 4kB pages");
 end
`endif
//pragma translate_on

endmodule // wt_icache
